In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sb
%matplotlib inline
import tabulate
from IPython.display import HTML, display
# Default size for every figure created in this notebook: (width, height).
plt.rc("figure", figsize=(12, 9))
In [31]:
# Load the dataset from the JSON file next to the notebook, then print its
# (rows, columns) shape followed by the full list of column names.
df = pd.read_json('./Lucas2018PointsWithWeatherDataSweden.json')
print(df.shape, df.columns, sep='\n')
(1898, 39)
Index(['D0_CloudCover_Afternoon', 'D0_Humidity_Afternoon',
'D0_Pressure_Afternoon', 'D0_Percipitation_Total', 'D0_Temperature_Min',
'D0_Temperature_Max', 'D0_Temperature_Afternoon',
'D0_Temperature_Night', 'D0_Temperature_Evening',
'D0_Temperature_Morning', 'D0_MaxWind_Speed', 'D0_MaxWind_Direction',
'POINTID', 'Depth', 'pH_CaCl2', 'pH_H2O', 'EC', 'OC', 'CaCO3', 'P', 'N',
'K', 'OC_2030_cm', 'CaCO3_2030_cm', 'Ox_Al', 'Ox_Fe', 'NUTS_0',
'NUTS_1', 'NUTS_2', 'NUTS_3', 'TH_LAT', 'TH_LONG', 'SURVEY_DATE',
'Elev', 'LC', 'LU', 'LC0_Desc', 'LC1_Desc', 'LU1_Desc'],
dtype='object')
Selecting the columns of interest and checking for NA values
In [33]:
# Columns kept for the analysis, in the order they appear in the frame.
selected_columns = [
    'D0_CloudCover_Afternoon', 'D0_Humidity_Afternoon',
    'D0_Pressure_Afternoon', 'D0_Percipitation_Total', 'D0_Temperature_Min',
    'D0_Temperature_Max', 'D0_Temperature_Afternoon',
    'D0_Temperature_Night', 'D0_Temperature_Evening',
    'TH_LAT', 'TH_LONG', 'D0_Temperature_Morning',
    'pH_CaCl2', 'pH_H2O', 'EC', 'P', 'N', 'K',
]

# Take an explicit copy of the selection: the column assignments below then
# act on an independent frame instead of a selection of the loaded one, which
# is the pattern pandas warns about with SettingWithCopyWarning.
df = df[selected_columns].copy()

# Text markers that appear in P, N and K in place of a number.
# NOTE(review): "LOD" presumably means "limit of detection" -- confirm against
# the dataset documentation. These readings are treated as 0.
below_limit_markers = ['< LOD', '<0.0', '< 0.0']

for col in ['P', 'N', 'K']:
    # Replace every marker with 0 in a single pass, then convert to float.
    # Any other non-numeric text still raises in astype(float), so unexpected
    # values are not silently swallowed.
    is_marker = df[col].isin(below_limit_markers)
    df[col] = df[col].mask(is_marker, 0).astype(float)
In [34]:
# Convert every column to float in a single call. P, N and K were already
# cast when they were cleaned, so for those three this is a no-op.
df = df.astype(float)

# Show the resulting dtype of each column.
df.dtypes
Out[34]:
D0_CloudCover_Afternoon float64 D0_Humidity_Afternoon float64 D0_Pressure_Afternoon float64 D0_Percipitation_Total float64 D0_Temperature_Min float64 D0_Temperature_Max float64 D0_Temperature_Afternoon float64 D0_Temperature_Night float64 D0_Temperature_Evening float64 TH_LAT float64 TH_LONG float64 D0_Temperature_Morning float64 pH_CaCl2 float64 pH_H2O float64 EC float64 P float64 N float64 K float64 dtype: object
In [35]:
# Preview the first five rows. The output shows that P still contains NaN
# (row 2), i.e. missing values remain in the frame at this point.
df.head(n=5)
Out[35]:
| D0_CloudCover_Afternoon | D0_Humidity_Afternoon | D0_Pressure_Afternoon | D0_Percipitation_Total | D0_Temperature_Min | D0_Temperature_Max | D0_Temperature_Afternoon | D0_Temperature_Night | D0_Temperature_Evening | TH_LAT | TH_LONG | D0_Temperature_Morning | pH_CaCl2 | pH_H2O | EC | P | N | K | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90.0 | 73.0 | 1013.0 | 3.70 | 283.359985 | 288.290009 | 286.059998 | 285.359985 | 286.019989 | 57.646988 | 12.961219 | 286.790009 | 3.8 | 4.11 | 61.90 | 27.3 | 19.2 | 292.7 |
| 1 | 68.0 | 65.0 | 1006.0 | 0.20 | 287.660004 | 292.329987 | 291.459991 | 288.970001 | 289.239990 | 59.424225 | 13.182595 | 289.549988 | 3.4 | 4.17 | 7.32 | 22.0 | 4.1 | 87.5 |
| 2 | 90.0 | 85.0 | 1010.0 | 3.60 | 284.649994 | 288.529999 | 286.880005 | 288.309998 | 284.649994 | 57.748528 | 13.238009 | 287.769989 | 3.1 | 4.06 | 5.77 | NaN | 2.6 | 63.7 |
| 3 | 20.0 | 77.0 | 1024.0 | 0.65 | 277.209991 | 285.339996 | 284.570007 | 282.799988 | 279.649994 | 56.525760 | 13.199473 | 282.600006 | 5.5 | 5.98 | 26.44 | 73.0 | 4.5 | 77.8 |
| 4 | 100.0 | 49.0 | 1029.0 | 0.00 | 270.049988 | 285.339996 | 284.959991 | 271.910004 | 281.200012 | 57.487030 | 14.215924 | 280.390015 | 4.2 | 4.62 | 5.17 | 21.1 | 1.8 | 38.5 |
In [36]:
# Pairwise Spearman (rank) correlation between all numeric columns.
corr = df.corr(method='spearman', numeric_only=True)

# Display a copy rounded to two decimals; `corr` itself keeps full precision.
corr.round(2)
Out[36]:
| D0_CloudCover_Afternoon | D0_Humidity_Afternoon | D0_Pressure_Afternoon | D0_Percipitation_Total | D0_Temperature_Min | D0_Temperature_Max | D0_Temperature_Afternoon | D0_Temperature_Night | D0_Temperature_Evening | TH_LAT | TH_LONG | D0_Temperature_Morning | pH_CaCl2 | pH_H2O | EC | P | N | K | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| D0_CloudCover_Afternoon | 1.00 | 0.27 | -0.05 | 0.26 | -0.05 | -0.15 | -0.17 | -0.09 | -0.06 | 0.09 | 0.04 | -0.12 | -0.07 | -0.07 | -0.05 | -0.01 | -0.03 | -0.03 |
| D0_Humidity_Afternoon | 0.27 | 1.00 | -0.23 | 0.44 | -0.15 | -0.47 | -0.51 | -0.16 | -0.33 | 0.03 | 0.03 | -0.39 | -0.09 | -0.10 | -0.02 | -0.04 | -0.01 | -0.04 |
| D0_Pressure_Afternoon | -0.05 | -0.23 | 1.00 | -0.31 | 0.16 | 0.31 | 0.34 | 0.12 | 0.30 | -0.24 | -0.23 | 0.26 | 0.05 | 0.05 | 0.12 | 0.04 | 0.09 | 0.10 |
| D0_Percipitation_Total | 0.26 | 0.44 | -0.31 | 1.00 | 0.06 | -0.12 | -0.16 | 0.09 | -0.06 | -0.07 | -0.04 | -0.06 | -0.07 | -0.07 | 0.04 | -0.01 | 0.05 | 0.00 |
| D0_Temperature_Min | -0.05 | -0.15 | 0.16 | 0.06 | 1.00 | 0.81 | 0.80 | 0.92 | 0.87 | -0.42 | -0.20 | 0.86 | 0.06 | 0.07 | 0.26 | 0.10 | 0.21 | 0.19 |
| D0_Temperature_Max | -0.15 | -0.47 | 0.31 | -0.12 | 0.81 | 1.00 | 0.99 | 0.74 | 0.93 | -0.32 | -0.15 | 0.96 | 0.11 | 0.10 | 0.18 | 0.08 | 0.15 | 0.12 |
| D0_Temperature_Afternoon | -0.17 | -0.51 | 0.34 | -0.16 | 0.80 | 0.99 | 1.00 | 0.73 | 0.92 | -0.32 | -0.16 | 0.94 | 0.10 | 0.10 | 0.18 | 0.08 | 0.15 | 0.12 |
| D0_Temperature_Night | -0.09 | -0.16 | 0.12 | 0.09 | 0.92 | 0.74 | 0.73 | 1.00 | 0.75 | -0.42 | -0.20 | 0.82 | 0.07 | 0.07 | 0.25 | 0.08 | 0.20 | 0.18 |
| D0_Temperature_Evening | -0.06 | -0.33 | 0.30 | -0.06 | 0.87 | 0.93 | 0.92 | 0.75 | 1.00 | -0.35 | -0.17 | 0.92 | 0.09 | 0.10 | 0.21 | 0.09 | 0.17 | 0.15 |
| TH_LAT | 0.09 | 0.03 | -0.24 | -0.07 | -0.42 | -0.32 | -0.32 | -0.42 | -0.35 | 1.00 | 0.66 | -0.33 | -0.02 | -0.02 | -0.43 | -0.12 | -0.30 | -0.32 |
| TH_LONG | 0.04 | 0.03 | -0.23 | -0.04 | -0.20 | -0.15 | -0.16 | -0.20 | -0.17 | 0.66 | 1.00 | -0.10 | 0.12 | 0.09 | -0.24 | -0.06 | -0.19 | -0.19 |
| D0_Temperature_Morning | -0.12 | -0.39 | 0.26 | -0.06 | 0.86 | 0.96 | 0.94 | 0.82 | 0.92 | -0.33 | -0.10 | 1.00 | 0.10 | 0.10 | 0.20 | 0.09 | 0.16 | 0.12 |
| pH_CaCl2 | -0.07 | -0.09 | 0.05 | -0.07 | 0.06 | 0.11 | 0.10 | 0.07 | 0.09 | -0.02 | 0.12 | 0.10 | 1.00 | 0.96 | -0.00 | -0.08 | -0.25 | -0.16 |
| pH_H2O | -0.07 | -0.10 | 0.05 | -0.07 | 0.07 | 0.10 | 0.10 | 0.07 | 0.10 | -0.02 | 0.09 | 0.10 | 0.96 | 1.00 | -0.05 | -0.06 | -0.26 | -0.14 |
| EC | -0.05 | -0.02 | 0.12 | 0.04 | 0.26 | 0.18 | 0.18 | 0.25 | 0.21 | -0.43 | -0.24 | 0.20 | -0.00 | -0.05 | 1.00 | 0.26 | 0.83 | 0.74 |
| P | -0.01 | -0.04 | 0.04 | -0.01 | 0.10 | 0.08 | 0.08 | 0.08 | 0.09 | -0.12 | -0.06 | 0.09 | -0.08 | -0.06 | 0.26 | 1.00 | 0.20 | 0.36 |
| N | -0.03 | -0.01 | 0.09 | 0.05 | 0.21 | 0.15 | 0.15 | 0.20 | 0.17 | -0.30 | -0.19 | 0.16 | -0.25 | -0.26 | 0.83 | 0.20 | 1.00 | 0.76 |
| K | -0.03 | -0.04 | 0.10 | 0.00 | 0.19 | 0.12 | 0.12 | 0.18 | 0.15 | -0.32 | -0.19 | 0.12 | -0.16 | -0.14 | 0.74 | 0.36 | 0.76 | 1.00 |
In [37]:
# Boolean mask that is True on the diagonal and above it, so the heatmap only
# draws the lower triangle of the symmetric correlation matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colour scale fixed to [-1, 1] and centred on 0, with the
# correlation value written in each cell.
# (Alternative palette tried earlier: sb.diverging_palette(20, 220, n=200).)
hm = sb.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    square=False,
)

# Rotate the column labels and right-align them.
# The trailing semicolon suppresses the list-of-Text repr in the cell output.
hm.set_xticklabels(hm.get_xticklabels(), rotation=45, horizontalalignment='right');
In [38]:
# Scatter-plot matrix of every column pair, with a kernel density estimate of
# each column on the diagonal. The scatter markers are semi-transparent with a
# black edge.
sb.pairplot(
    df,
    height=2,
    diag_kind='kde',
    plot_kws=dict(alpha=0.6, s=80, edgecolor='k'),
)
Out[38]:
<seaborn.axisgrid.PairGrid at 0x1b519205c50>